library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.2.5  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.1       ✔ stringr 1.3.1  
## ✔ readr   1.1.1       ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readxl)
library(httr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)

Continuous variable

Import & format data

GET('http://www.africapolis.org/download/Africapolis_agglomeration_2015.xlsx', 
    write_disk(africapolis_impt <- tempfile(fileext = ".xls")))
## Response [http://www.africapolis.org/download/Africapolis_agglomeration_2015.xlsx]
##   Date: 2019-04-02 19:41
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 1.65 MB
## <ON DISK>  /var/folders/k5/7wfb52hs61v78z05j3k54rq80000gn/T//RtmpS74KH5/file3cb57d1a9fec.xls
# ggplot(afr_cities, aes(x = Population_2015, y = Builtup)) +
#   geom_point()
hub_cities <- afr_cities %>%
  drop_na(closest_metro) %>%
  group_by(closest_metro) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  head(5)

hub_cities
## # A tibble: 5 x 2
##   closest_metro     n
##   <chr>         <int>
## 1 Cairo          1063
## 2 Onitsha         424
## 3 Alger           383
## 4 Kano            377
## 5 Ibadan          277
# ggplot(test, aes(x = Population_2015, y = Builtup)) +
#   theme_classic() +
#   scale_y_continuous(labels = scales::comma) +
#   scale_x_continuous(labels = scales::comma) +
#   geom_point() +
#   geom_smooth(method = 'lm', formula = y ~ splines::bs(x, 3), se = FALSE) +
#   facet_grid(rows = vars(factor(closest_metro)))
color <- brewer.pal(4, "Set1")

# set up title formatting
title <- list(
  text = "Relationship Between Job Prestige of R and Spouse",
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "left",
  align = "left", x = 0, y = 1.2, 
  showarrow = FALSE
)

plots <- list()
idx <- 0
for (hub in hub_cities$closest_metro){
  idx <- idx + 1
  
  a <- list(
    text = hub_cities$closest_metro[idx],
    xref = "paper",
    yref = "paper",
    yanchor = "bottom",
    xanchor = "center",
    align = "center",
    x = 0.5,
    y = 1,
    showarrow = FALSE
  )
  
  data <- afr_cities %>%
    filter(closest_metro == hub) %>%
    filter(dist_to_metro < 500) %>%
    arrange(dist_to_metro)

  #fit <- lm(Builtup ~ dist_to_metro, data = data)
  fit <- lm(Population_2015 ~ dist_to_metro, data = data)
   
  p <- plot_ly(data, x = ~dist_to_metro, y = ~Population_2015, 
               height = 800, width = 500) %>%
    
       add_trace(type = "scatter", mode = "markers", hoverinfo = 'text',
                  text = paste("Name of town: ", data$Agglomeration_Name, "<br>", 
                           "Altitude: ", data$Altitude),
                  marker = list(size = 3, 
                                color = color[1],
                                opacity = .33), 
                  showlegend = FALSE) %>%
     
       add_trace(x = ~dist_to_metro, y = fitted(fit), mode = "lines",
                  name = "",
                  line = list(width = 2,
                              color = color[2])) %>%

      layout(yaxis = list(title = "Population",
                          range = c(3, 8),
                          type = 'log',
                          exponentformat='E',
                        zeroline = FALSE), 
           
             xaxis = list(title = "Distance from Closest Metro",
                      zeroline = FALSE), 
           
           annotations = a)   #%>%
  
  plots <- c(plots, list(p)) 

}
    
subplot(plots, nrows = nrow(hub_cities), shareX = TRUE, shareY = TRUE, titleY = TRUE, titleX = TRUE) %>%
  layout(annotations = title, showlegend = FALSE) %>%
  config(collaborate = FALSE, displaylogo = FALSE, displayModeBar = FALSE)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

How it looks in GGplot for comparison

test <- afr_cities %>%
  filter(closest_metro %in% hub_cities$closest_metro) %>%
  filter(dist_to_metro > 0) %>%
  filter(Population_2015 < 500000)


ggplot(test, aes(x = dist_to_metro, y = Population_2015)) +
  theme_classic() +
  scale_y_log10(labels = scales::comma) +
  geom_point(color = "lightblue") +
  geom_smooth(method = 'lm', formula = y ~ splines::bs(x, 3), se = FALSE, color = "steelblue") +
  facet_grid(rows = vars(factor(closest_metro)))

#Categorical Variable From https://www.governing.com/gov-data/residential-racial-segregation-metro-areas.html

##Reading and formating the data

GET('http://images.centerdigitaled.com/documents/segregation-national-download.csv', 
    write_disk(us_cities_impt <- tempfile(fileext = ".csv")))
## Response [http://images.centerdigitaled.com/documents/segregation-national-download.csv]
##   Date: 2019-04-02 19:41
##   Status: 200
##   Content-Type: application/csv
##   Size: 28.3 kB
## <ON DISK>  /var/folders/k5/7wfb52hs61v78z05j3k54rq80000gn/T//RtmpS74KH5/file3cb5fe90d5.csv
us_cities <- read.csv(us_cities_impt)

#Separating city and state
us_cities <- separate(data = us_cities, col = MSA, into = c("City", "State"), sep = ",")
#Cities are on the border of two states and will be removed
us_cities <- us_cities[!grepl("-",us_cities$State),]
#Keeping only columns of interest
us_states <- us_cities[,c("State", "Black.Population", "White.Population..Non.Hispanic.", "Hispanic.Population", "Asian.Population")]

us_states$Black.Population <- as.numeric(gsub(",","", as.character(us_states$Black.Population)))
us_states$White.Population..Non.Hispanic. <- as.numeric(gsub(",","", as.character(us_states$White.Population..Non.Hispanic.)))
us_states$Hispanic.Population <- as.numeric(gsub(",","", as.character(us_states$Hispanic.Population)))
us_states$Asian.Population <- as.numeric(gsub(",","", as.character(us_states$Asian.Population)))

#Grouping race populations by state
us_states <- aggregate(.~ State, us_states, FUN=sum)

#Calculate total sample population of each state
us_states$TotalPop <- apply(us_states[-1],1,sum)

#Convert counts to percentages
us_states$Black.Population <- round(us_states$Black.Population/us_states$TotalPop *100, 1)
us_states$White.Population..Non.Hispanic. <- round(us_states$White.Population..Non.Hispanic./us_states$TotalPop*100, 1)
us_states$Hispanic.Population <- round(us_states$Hispanic.Population/us_states$TotalPop*100, 1)
us_states$Asian.Population <- round(us_states$Asian.Population/us_states$TotalPop*100, 1)

##Plotting racial breakdown of states from population samples

color <- brewer.pal(4, "Set1")
plot_ly(us_states, x = ~State, y = ~Black.Population, 
        type = 'bar',
        name = 'Black Population',
        marker = list(color = color[1]),
        text = paste("<b>State:</b>", us_states$State, "<br><b>Sample Population:</b>", prettyNum(us_states$TotalPop,big.mark=",",scientific=FALSE)),
        hoverinfo = "text+y") %>%
  add_trace(y = ~White.Population..Non.Hispanic., 
            name = 'White Population',
            marker=list(color = color[2])) %>%
  add_trace(y = ~Hispanic.Population, 
            name = 'Hispanic Population',
            marker=list(color = color[3])) %>%
  add_trace(y = ~Asian.Population, 
            name = 'Asian Population',
            marker=list(color = color[4])) %>%
  layout(margin = list(t = 80),
         barmode = 'stack', 
         annotations = list(text = "Racial Breakdown of Urban Areas in US States", 
                            showarrow = FALSE, 
                            font = list(size = 19), 
                            x = 0.5, 
                            xref = "paper", 
                            xanchor = "center", 
                            y = 1.2, 
                            yref = "paper"),
         yaxis = list(title = "Percent of Population",
                      ticksuffix = "%",
                      zeroline = FALSE))   %>%
  add_annotations(text = paste0("Based on Data from 273 Cities in 43 States"), 
                  showarrow = FALSE, 
                  font = list(size = 16), 
                  x = 0.5, 
                  xref = "paper", 
                  xanchor = "center", 
                  y = 1.12, 
                  yref = "paper")

##Looking at a sample of 5 random states Because there is data from 43 states, we decided to choose five random states to look at. Using this subset, we created a second chart below, which is easier to interpret.

set.seed(1)
index <- sample(1:nrow(us_states), 5)
us_sub <- us_states[index,]

plot_ly(us_sub, x = ~State, y = ~Black.Population, 
        type = 'bar',
        name = 'Black Population',
        marker = list(color = color[1]),
        text = paste("<b>State:</b>", us_sub$State, "<br><b>Sample Population:</b>", prettyNum(us_sub$TotalPop,big.mark=",",scientific=FALSE)),
        hoverinfo = "text+y") %>%
  add_trace(y = ~White.Population..Non.Hispanic., 
            name = 'White Population',
            marker=list(color = color[2])) %>%
  add_trace(y = ~Hispanic.Population, 
            name = 'Hispanic Population',
            marker=list(color = color[3])) %>%
  add_trace(y = ~Asian.Population, 
            name = 'Asian Population',
            marker=list(color = color[4])) %>%
  layout(margin = list(t = 80),
         barmode = 'stack', 
         annotations = list(text = "Racial Breakdown of Urban Areas in Five US States", 
                            showarrow = FALSE, 
                            font = list(size = 19), 
                            x = 0.5, 
                            xref = "paper", 
                            xanchor = "center", 
                            y = 1.2, 
                            yref = "paper"),
         yaxis = list(title = "Percent of Population",
                      ticksuffix = "%",
                      zeroline = FALSE))
States <- c(' NY',' TX',' CA',' FL')
state_us_cities <- us_cities %>% filter(State %in% States)
head(state_us_cities)
##    CBSA                         City State Black.Population
## 1 10180                      Abilene    TX           13,111
## 2 10580      Albany-Schenectady-Troy    NY           64,242
## 3 11100                     Amarillo    TX           15,077
## 4 12420 Austin-Round Rock-Georgetown    TX          138,478
## 5 12540                  Bakersfield    CA           45,024
## 6 13140         Beaumont-Port Arthur    TX           95,980
##   White.Population..Non.Hispanic. Hispanic.Population Asian.Population
## 1                         110,168              38,892            2,704
## 2                         709,495              42,999           37,424
## 3                         159,073              72,974            8,283
## 4                       1,057,470             643,813          108,635
## 5                         310,638             458,907           39,808
## 6                         223,131              58,674           10,474
##   Black.White.Dissimilarity Hispanic.NonHispanic.Dissimilarity
## 1                     0.420                              0.344
## 2                     0.609                              0.351
## 3                     0.603                              0.387
## 4                     0.491                              0.377
## 5                     0.530                              0.463
## 6                     0.650                              0.386
##   Asian.White.Dissimilarity Hispanic.NonHispanic.White..Dissimilarity
## 1                        NA                                     0.379
## 2                     0.467                                     0.413
## 3                        NA                                     0.445
## 4                     0.420                                     0.416
## 5                     0.481                                     0.520
## 6                     0.593                                     0.487
##   Hispanic.White.NonHispanic.Black H.NH.Black
## 1                            0.425      0.397
## 2                            0.446      0.344
## 3                            0.409      0.407
## 4                            0.345      0.335
## 5                            0.499      0.461
## 6                            0.482      0.465
state_us_cities$Black.Population<-as.character(state_us_cities$Black.Population)
state_us_cities$Black.Population<-as.numeric(gsub(',','',state_us_cities$Black.Population))
state_us_cities$White.Population..Non.Hispanic.<-as.character(state_us_cities$White.Population..Non.Hispanic.)
state_us_cities$White.Population..Non.Hispanic.<-as.numeric(gsub(',','',state_us_cities$White.Population..Non.Hispanic.))
state_us_cities$Black.White.Dissimilarity<-as.numeric(state_us_cities$Black.White.Dissimilarity)
head(state_us_cities)
##    CBSA                         City State Black.Population
## 1 10180                      Abilene    TX            13111
## 2 10580      Albany-Schenectady-Troy    NY            64242
## 3 11100                     Amarillo    TX            15077
## 4 12420 Austin-Round Rock-Georgetown    TX           138478
## 5 12540                  Bakersfield    CA            45024
## 6 13140         Beaumont-Port Arthur    TX            95980
##   White.Population..Non.Hispanic. Hispanic.Population Asian.Population
## 1                          110168              38,892            2,704
## 2                          709495              42,999           37,424
## 3                          159073              72,974            8,283
## 4                         1057470             643,813          108,635
## 5                          310638             458,907           39,808
## 6                          223131              58,674           10,474
##   Black.White.Dissimilarity Hispanic.NonHispanic.Dissimilarity
## 1                     0.420                              0.344
## 2                     0.609                              0.351
## 3                     0.603                              0.387
## 4                     0.491                              0.377
## 5                     0.530                              0.463
## 6                     0.650                              0.386
##   Asian.White.Dissimilarity Hispanic.NonHispanic.White..Dissimilarity
## 1                        NA                                     0.379
## 2                     0.467                                     0.413
## 3                        NA                                     0.445
## 4                     0.420                                     0.416
## 5                     0.481                                     0.520
## 6                     0.593                                     0.487
##   Hispanic.White.NonHispanic.Black H.NH.Black
## 1                            0.425      0.397
## 2                            0.446      0.344
## 3                            0.409      0.407
## 4                            0.345      0.335
## 5                            0.499      0.461
## 6                            0.482      0.465
BL <- state_us_cities %>% group_by(State) %>%
  summarise(BLpop = sum(Black.Population), WHpop = sum(White.Population..Non.Hispanic.),BWdis = mean(Black.White.Dissimilarity, na.rm = TRUE)*(BLpop+WHpop))
BL
## # A tibble: 4 x 4
##   State   BLpop    WHpop    BWdis
##   <chr>   <dbl>    <dbl>    <dbl>
## 1 " CA" 2150235 14168253 8479337.
## 2 " FL" 3012609 10443747 6719207.
## 3 " NY"  472360  4114731 2825648.
## 4 " TX" 2952078  9945298 6301980.
BL_pop <- BL$BLpop
BW_dis <- BL$BWdis
WHpop <- BL$WHpop

data <- data.frame(States,BL_pop,BW_dis)

p <- plot_ly(data,x=~States,y=~BL_pop,type='bar',name='BL Population') %>%
  add_trace(y=~BW_dis,name='Black/White Disparity') %>%
  add_trace(y=~WHpop,name='WH Population') %>%
  layout(yaxis = list(title = 'Population'),title='Black/White Disparity', barmode='group')
p